In [1]:
from shared import graph, stats_utils
from src_draft.utils import LOW_IMP_FEATURES
import shared.ml_config_core as ml_config_core
import pandas as pd
from shared.ml_config_core import ModelConfigsCollection
from shared.ml_config_runner import run_tuning_for_configs_collection
from shared.definitions import TuningResult
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from pandas import CategoricalDtype
from Draft import feature_builder_v2

import importlib
from matplotlib import pyplot as plt
import src_draft.utils as shared_utils
import seaborn as sns
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\sklearn\metrics\_scorer.py:548: FutureWarning: The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.
  warnings.warn(
In [2]:
importlib.reload(shared_utils)

shared_utils.pandas_config(pd)
shared_utils.plt_config(plt)

sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")
In [3]:
features_matrix = feature_builder_v2.load_datasets_and_prepare_features(drop_meta_data=True,
                                                                        ds_type=feature_builder_v2.DatasetType.FULL)
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\featuretools\entityset\entityset.py:1914: UserWarning: index SK_BUREAU_ID not found in dataframe, creating new integer column
  warnings.warn(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\woodwork\type_sys\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\featuretools\computational_backends\feature_set_calculator.py:828: FutureWarning: The provided callable <function min at 0x0000019C5C314400> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "min" instead.
  ).agg(to_agg)
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\featuretools\computational_backends\feature_set_calculator.py:828: FutureWarning: The provided callable <function mean at 0x0000019C5C314CC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  ).agg(to_agg)
C:\Users\Paulius\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\featuretools\computational_backends\feature_set_calculator.py:828: FutureWarning: The provided callable <function max at 0x0000019C5C3142C0> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.
  ).agg(to_agg)
Appending previous history
Full DS size: 307511
In [4]:
conditions = [
    features_matrix["PrevRatioRejectedAccepted"].isna(),
    features_matrix["PrevRatioRejectedAccepted"] == 0,
    features_matrix["PrevRatioRejectedAccepted"] <= 0.25,
    features_matrix["PrevRatioRejectedAccepted"] > 0.25
]

conditions_2 = [
    features_matrix["PrevRatioRejectedAccepted"].isna(),
    features_matrix["PrevRatioRejectedAccepted"] == 0,
    features_matrix["PrevRatioRejectedAccepted"] > 0,
]


choices = ["No Previous App.", 'All Accepted', "< 25% Rejected", "> 25% Rejected"]
choices_2 = ["No Previous App.", 'All Accepted', "> 0% Rejected"]
# choices = ['All Accepted', "> 0 Rejected"]
# choices = ['No Previous', '0', '> 0']

features_matrix["PrevRatioRejectedAccepted_cats"] = np.select(conditions, choices, default='No Previous App')
features_matrix["PrevRatioRejectedAccepted_cats_2"] = np.select(conditions, choices, default='No Previous App')

features_matrix["PrevRatioRejectedAccepted_cats"] = features_matrix["PrevRatioRejectedAccepted_cats"].astype("category")
features_matrix["PrevRatioRejectedAccepted_cats_2"] = features_matrix["PrevRatioRejectedAccepted_cats_2"].astype("category")
In [4]:
 
In [5]:
stats_utils.nan_summary(features_matrix[["PrevRatioRejectedAccepted"]])
Out[5]:
Total NaN Values Proportion NaN (%)
PrevRatioRejectedAccepted 16847 5.0

Exploratory Analysis¶

This notebooks includes the analysis of selected variables (based on their importance at predicting the target variable) and their relationships. Individual analysis of each variable is available in the EDA_appendices notebook.

In [6]:
add_features = ["PrevRatioRejectedAccepted_cats", "PrevRatioRejectedAccepted_cats_2", "TARGET"]
features_matrix_only_high_imp = features_matrix[shared_utils.HIGH_IMP_FEATURES + add_features]
features_matrix_any_imp = features_matrix[shared_utils.ANY_IMP_FEATURES + add_features]
In [7]:
# TODO impute missing values, either 
stats_utils.nan_summary(features_matrix_only_high_imp)
Out[7]:
Total NaN Values Proportion NaN (%)
ExtSource2 660 0.0
ExtSource3 60965 20.0
ExtSource1 173378 56.0
AmtGoodsPrice 278 0.0
OwnCarAge 202929 66.0
PrevAmtDownPaymentSum 16454 5.0
AmtAnnuity 12 0.0
MeanbureaudaysCredit 44020 14.0
MeanbureauamtCreditSumDebt 51380 17.0
PrevAvgYieldGroup 18945 6.0
PrevCreditReceivedRequestedDiff 16454 5.0
OccupationType 96391 31.0
PrevRatioRejectedAccepted 16847 5.0
MaxbureaudaysCreditEnddate 46269 15.0
PrevLastLoanGoodsCategory 16454 5.0
MeanbureauamtCreditMaxOverdue 123625 40.0
In [7]:
 
In [8]:
# TODO impute missing values (mean for numerical, proportion sampling for cat)
# OR inside correlation check just drop rows with missing values for tested columns
importlib.reload(graph)

features_matrix_any_imp_no_nan = features_matrix_only_high_imp.dropna(axis=0, how="any")
features_matrix_any_imp_no_nan = features_matrix_any_imp_no_nan.apply(
    lambda col: col.astype(float) if col.dtype == 'Float64' else col.astype(int) if col.dtype == 'Int64' else col)

graph.render_corr_matrix_based_on_type(features_matrix_any_imp_no_nan)
V:\projects\ppuodz-ML.4.1\shared\graph.py:1269: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  corr = round(corr.applymap(pd.to_numeric), 2)
No description has been provided for this image

The TARGET variable (loans with payment difficulties) is most correlated with credit ratings obtained from external sources. The correlation is very weak but still significant.

In [9]:
correlation_results = []
for col in features_matrix_any_imp_no_nan.columns:
    if col == "TARGET":
        continue
    x = features_matrix_any_imp_no_nan["TARGET"]
    y = features_matrix_any_imp_no_nan[col]

    corr_value, p_value = stats_utils.correlation_test(x, y)
    
    if p_value < 0.05:
        correlation_results.append({'Column': col, 'Coefficient': corr_value, 'P-Value': p_value})

correlation_df = pd.DataFrame(correlation_results).set_index('Column')
correlation_df = correlation_df.loc[correlation_df['Coefficient'].abs().sort_values(ascending=False).index]
correlation_df.round(3)
Out[9]:
Coefficient P-Value
Column
ExtSource3 -0.161 0.000
ExtSource1 -0.131 0.000
ExtSource2 -0.128 0.000
MeanbureaudaysCredit 0.093 0.000
OccupationType 0.075 0.000
DaysEmployed 0.074 0.000
PrevRatioRejectedAccepted 0.073 0.000
PrevRatioRejectedAccepted_cats 0.072 0.000
PrevRatioRejectedAccepted_cats_2 0.072 0.000
OrganizationType 0.069 0.000
NameEducationType 0.067 0.000
PrevAmtDownPaymentSum -0.057 0.000
PrevCreditReceivedRequestedDiff 0.055 0.000
DaysBirth 0.053 0.000
PrevLastLoanGoodsCategory 0.051 0.000
OwnCarAge 0.050 0.000
MeanbureauamtCreditSumDebt 0.049 0.000
MeanbureauamtCreditMaxOverdue 0.044 0.000
DaysIdPublish 0.042 0.000
CodeGender 0.041 0.000
PrevAvgYieldGroup 0.040 0.000
FlagDocument3 0.039 0.000
AmtGoodsPrice -0.034 0.000
MaxbureaudaysCreditEnddate 0.034 0.000
NameFamilyStatus 0.027 0.002
AmtCredit -0.023 0.001
In [9]:
 

`` Because the datatypes of features vary we had to use different methods to measure the strength and significance of each pair:

  • Chi-Squared Test: Assesses independence between two categorical variables. For bool-bool pairs due to categorical nature.

  • Point Biserial Correlation: Measures correlation between a binary and a continuous variable. For bool-numerical pairs to account for mixed data types.

  • Spearman's Rank Correlation: Assesses monotonic relationship between two continuous variables. Used for numerical-numerical pairs (for non-normally distributed data).

Since the Chi-Squared test outputs an unbound statistic/value which can't be directly compared to pointbiserialr or Spearman Rank we have converted them to a Cramér's V: value which is normalized between 0 and 1. This was done to make the values in the matrix more uniform however we must note that Cramér's V and Spearman's correlation coefficients are fundamentally different statistics and generally can't be directly compared.

In [9]:
 
In [9]:
 
In [10]:
features_matrix_only_imp_cat_cols = features_matrix_only_high_imp.select_dtypes(include='category').columns
features_matrix_target_cat = features_matrix_only_high_imp.copy()
features_matrix_target_cat["TARGET"] = features_matrix_target_cat["TARGET"].map(
    lambda x: "Default/Loan With Issues" if x == 1 else "No Issues")
In [10]:
 
In [11]:
features_matrix_target_cat["PrevRatioRejectedAccepted_cats"].dtype
Out[11]:
CategoricalDtype(categories=['< 25% Rejected', '> 25% Rejected', 'All Accepted', 'No Previous App.'], ordered=False, categories_dtype=object)

The chart below shows the relationship between selected categorical variables and loan status. E.g. a significantly higher proportion of loans taken out by males had issues.

In [12]:
importlib.reload(graph)
graph.draw_distribution_pie_charts(
    features_matrix_target_cat,
    split_var="TARGET",
    include_cols=features_matrix_only_imp_cat_cols,
    title="Distribution of Categorical Variables Relative to Loan Risk",
    clean_tick_label = False,
)
No description has been provided for this image
In [13]:
features_matrix_with_bins = features_matrix_only_high_imp.copy()
numerical_cols = features_matrix_only_high_imp.select_dtypes(
    include=["int64", "float64", "Int64"]
).columns

for col in numerical_cols:
    if features_matrix_with_bins[col].nunique() < 5:
        features_matrix_with_bins[f"{col}_binned"] = features_matrix_with_bins[col].astype("category")
    else:
        features_matrix_with_bins[f"{col}_binned"] = stats_utils.bin_and_label(
            features_matrix_with_bins[col], num_bins=4
        )
    features_matrix_with_bins[col] = features_matrix_with_bins[col]
import numpy as np


conditions = [
    features_matrix["TotalDefaults"] == 0,
    features_matrix["TotalDefaults"] >= 1,
    # features_matrix["TotalDefaults"] > 1
]

choices = ["No Defaults", '1 Defaulted Loans']  #,"> 1 defaulted loan"]
# choices = ['All Accepted', "> 0 Rejected"]
# choices = ['No Previous', '0', '> 0']

features_matrix_with_bins["TotalDefaults_cats"] = np.select(conditions, choices, default='WTF?').astype("object")

features_matrix_with_bins["Defaulted"] = features_matrix_with_bins["TARGET"].map(lambda x: "Yes" if x == 1 else "No")
features_matrix_with_bins.drop(columns=["TARGET", "TARGET_binned"], inplace=True)
In [14]:
features_matrix_with_bins["PrevRatioRejectedAccepted_cats"].dtype
Out[14]:
CategoricalDtype(categories=['< 25% Rejected', '> 25% Rejected', 'All Accepted', 'No Previous App.'], ordered=False, categories_dtype=object)

Relationships Between Numerical and Categorical Variables¶

The charts below show pairs of numerical and categorical features (including some binned numerical features) that have a signficant relationships and at least a small effect size (eta_squared>0.01) based on the non-parametric Kruskal-Wallis Test (one-way ANOVA on ranks) testing whether samples originate from the same distribution.

*It's similar to the Mann–Whitney U test but allows comparing more than 2 groups

In [15]:
importlib.reload(graph)
for target_y in ["ExtSource2", "AmtCredit", "DaysEmployed"]:
    for c in features_matrix_with_bins.columns:
        if pd.api.types.is_numeric_dtype(features_matrix_with_bins[c]):
            continue

        if target_y in c and "binned" in c:
            continue
            
        if "ExtSource" in target_y and "ExtSource" in c:
            continue
            
        # Drop cols with to many categories
        if features_matrix_with_bins[c].nunique() > 10:
            continue
        # if VERBOSE:
        #     display(f"{c} vs {target_y}")
        res = graph.boxen_plot_by_cat(c, features_matrix_with_bins, target_y)
        
        if res:
            display(res)
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image
V:\projects\ppuodz-ML.4.1\shared\graph.py:1470: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = _df.groupby(c)[y_target]
V:\projects\ppuodz-ML.4.1\shared\graph.py:1483: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  group_counts = _df.groupby(c).size()
No description has been provided for this image

External Credit Scores (ExtSource1)¶

In [16]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# Plot setup
plt.figure(figsize=(12, 6))

line_styles = ['--', ':', '-.']

x_range = np.linspace(features_matrix[['ExtSource1', 'ExtSource2', 'ExtSource3']].min().min(),
                      features_matrix[['ExtSource1', 'ExtSource2', 'ExtSource3']].max().max(), 100)

# Initialize lists for storing predictions
predictions = {}

colors = plt.cm.get_cmap('tab10', 4)

for i, source in enumerate(['ExtSource1', 'ExtSource2', 'ExtSource3']):
    subset = features_matrix[[source, 'TARGET']].dropna()
    X = sm.add_constant(subset[source])
    y = subset['TARGET']

    model = sm.Logit(y, X).fit(disp=0)
    X_pred = pd.DataFrame({'const': 1, source: x_range})

    y_pred = model.predict(X_pred)
    predictions[source] = y_pred

    plt.plot(x_range, y_pred, color=colors(i), linestyle=line_styles[i], alpha=0.5, label=f'{source} (individual)')

combined_features = features_matrix[['ExtSource1', 'ExtSource2', 'ExtSource3', 'TARGET']].dropna()
X_combined = sm.add_constant(combined_features[['ExtSource1', 'ExtSource2', 'ExtSource3']])
y_combined = combined_features['TARGET']

model_combined = sm.Logit(y_combined, X_combined).fit(disp=0)
X_pred_combined = pd.DataFrame({'const': 1, 'ExtSource1': x_range, 'ExtSource2': x_range, 'ExtSource3': x_range})

y_pred_combined = model_combined.predict(X_pred_combined)
y_pred_combined_for_metrics = model_combined.predict(X_combined)

predictions['Combined'] = y_pred_combined

residuals_combined = y_combined - model_combined.predict(X_combined)
residual_std_combined = np.std(residuals_combined)

combined_color = colors(3)  # Selecting the fourth color for the combined model
plt.plot(x_range, y_pred_combined, color=combined_color,
         label='Combined - Predicted Default Probability')  # Solid line for combined model
plt.fill_between(x_range, y_pred_combined - residual_std_combined, y_pred_combined + residual_std_combined,
                 color=combined_color, alpha=0.2)

auc_combined = roc_auc_score(y_combined, y_pred_combined_for_metrics)
accuracy_combined = accuracy_score(y_combined, y_pred_combined_for_metrics.round())  # Assuming binary classification
logloss_combined = log_loss(y_combined, y_pred_combined_for_metrics)

metrics = f"AUC: {auc_combined:.2f}, Accuracy: {accuracy_combined:.2f}, Log-loss: {logloss_combined:.2f}"
plt.annotate(metrics, xy=(0.01, -0.175), xycoords='axes fraction', fontsize=14, color='black')

plt.title('Predicted Probability of Default by Credit Score Source\n(Logit)')
plt.xlabel('Normalized Credit Score')
plt.ylabel('Probability of Default')
plt.legend()
plt.show()
C:\Users\Paulius\AppData\Local\Temp\ipykernel_35252\2151574185.py:16: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  colors = plt.cm.get_cmap('tab10', 4)
No description has been provided for this image
In [17]:
model_params = model_combined.params
p_values = model_combined.pvalues
conf_int = model_combined.conf_int()
std_errors = model_combined.bse

coeff_df = pd.DataFrame({
    'Coefficient': model_params,
    'Standard Error': std_errors,
    'P-Value': p_values,
    'Conf. Interval Lower': conf_int[0],
    'Conf. Interval Upper': conf_int[1]
})
coeff_df.round(3)
Out[17]:
Coefficient Standard Error P-Value Conf. Interval Lower Conf. Interval Upper
const 0.600 0.040 0.0 0.521 0.680
ExtSource1 -2.099 0.061 0.0 -2.219 -1.979
ExtSource2 -1.964 0.060 0.0 -2.082 -1.846
ExtSource3 -2.779 0.062 0.0 -2.902 -2.657

Normalized credit ratings from three sources are inversely related to default risk, with ExtSource3 having the strongest influence. We can see that a basic Logistic model can already provide a reasonably high result (AUC = 0.74). However, we have to note that the results are based on the full training set and are only provided for EDA/feature analysis purposes. Full statistical modelling will be done in further sections.

In [18]:
# Plotting
plt.figure(figsize=(12, 6))
for i in range(1, 2):
    col = f'ExtSource{i}'
    sns.kdeplot(data=features_matrix[features_matrix['TARGET'] == 1][col], label=f'{col} - Default', shade=True)
    sns.kdeplot(data=features_matrix[features_matrix['TARGET'] == 0][col], label=f'{col} - No Default', shade=True)

plt.title('Density Plot of ExtSource Scores by Default Status')
plt.xlabel('Normalized Credit Score')
plt.ylabel('Density')
plt.legend()
plt.show()
C:\Users\Paulius\AppData\Local\Temp\ipykernel_35252\3397180186.py:5: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=features_matrix[features_matrix['TARGET'] == 1][col], label=f'{col} - Default', shade=True)
C:\Users\Paulius\AppData\Local\Temp\ipykernel_35252\3397180186.py:6: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=features_matrix[features_matrix['TARGET'] == 0][col], label=f'{col} - No Default', shade=True)
No description has been provided for this image
In [19]:
from scipy.stats import gaussian_kde
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

df = features_matrix[['ExtSource1', 'ExtSource2', 'ExtSource3', "TARGET", "AmtCredit"]].copy()

# Calculate the average of all ExtSources
df['ExtSourceAvg'] = df[['ExtSource1', 'ExtSource2', 'ExtSource3']].mean(axis=1, skipna=True)

# sources = ['ExtSource1']#, 'ExtSource2', 'ExtSource3', 'ExtSourceAvg']
sources = ['ExtSource1', 'ExtSource2', 'ExtSource3', 'ExtSourceAvg', 'AmtCredit']

for source in sources:
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    fig.suptitle(f'Analysis for {source}', fontsize=16, y=1.05)  # Top-level title

    # Separate the data
    subset_default = df[df['TARGET'] == 1][source].dropna()
    subset_non_default = df[df['TARGET'] == 0][source].dropna()

    # Total number of observations with valid data
    total_count = len(df[source].dropna())

    # Define the range for the KDE
    score_range = np.linspace(df[source].min(), df[source].max(), 300)

    # KDE for defaults
    kde_default = gaussian_kde(subset_default, bw_method='silverman')
    density_default = kde_default(score_range) * len(subset_default) / total_count

    # KDE for non-defaults
    kde_non_default = gaussian_kde(subset_non_default, bw_method='silverman')
    density_non_default = kde_non_default(score_range) * len(subset_non_default) / total_count

    # Plotting
    # TODO: add fill with alpha like kde plots
    sns.lineplot(x=score_range, y=density_default, ax=ax1, label='Default Probability')
    sns.lineplot(x=score_range, y=density_non_default, ax=ax1, label='Non Default Probability')

    ax1.set_title(f'KDE', fontsize=10)  # Smaller font size for subplot title
    ax1.set_xlabel('Normalized Credit Score')
    ax1.set_ylabel('Density')
    ax1.legend()

    # Regression Plot
    subset = df[[source, 'TARGET']].dropna()

    sns.kdeplot(
        data=subset,
        x=source,
        hue="TARGET",
        # kind="kde",
        # height=6,
        multiple="fill",
        ax=ax2
        # clip=(10, 80),
    )
    # plt.title("Default Rate and EXT_SOURCE_1", x=0.5, y=1.025, fontdict={"size": 16})

    ax2.set_xlabel('Normalized Credit Score')
    ax2.set_ylabel('Probability of Default')
    # ax2.legend()

    # ROC AUC as annotation
    # roc_auc = roc_auc_score(y, model.predict(X))
    # ax2.annotate(f'ROC AUC: {roc_auc:.2f}', xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, verticalalignment='top')

    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [19]:
 
In [20]:
features_matrix_with_bins["PrevRatioRejectedAccepted_cats"].value_counts()
Out[20]:
PrevRatioRejectedAccepted_cats
All Accepted        190370
> 25% Rejected       66215
< 25% Rejected       34079
No Previous App.     16847
Name: count, dtype: int64
In [21]:
features_matrix_with_bins["TotalDefaults_cats"].value_counts()
Out[21]:
TotalDefaults_cats
No Defaults          304114
1 Defaulted Loans      3397
Name: count, dtype: int64
In [22]:
features_matrix_with_bins["TotalDefaults"].value_counts()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'TotalDefaults'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[22], line 1
----> 1 features_matrix_with_bins["TotalDefaults"].value_counts()

File ~\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~\AppData\Local\pypoetry\Cache\virtualenvs\ppuodz-ml-4-1-dqELbViF-py3.12\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'TotalDefaults'
In [ ]:
features_matrix["PrevRatioRejectedAccepted"].describe()

Previous Loan History and Default Risk¶

The chart below shows the default rate based on whether applicant has previous applied for loans with Home Cred: No Previous App. - no previous applications for client found (i.e. new clients) All Accepted - all previous applications were accepted < 25% Rejected - less than 1/4 applications were rejected > 25% Rejected - more than 1/4 applications were rejected

In [ ]:
 
In [ ]:
features_matrix_with_bins["TotalDefaults_cats"].value_counts()
In [ ]:
features_matrix_with_bins["PrevRatioRejectedAccepted_cats"].value_counts()

Interestingly we can see that while applicants whose previous loans were rejected are significantly more likely to default when finally given a loan previous clients with no failed applications have a higher default risk than new clients.

This likely limits the usefulness of the previous_application table because only a small proportion of clients have previously rejected applications

In [ ]:
 
In [ ]:
 
In [ ]:
features_matrix["TotalDefaults"].describe()
In [ ]:
list(features_matrix_with_bins.columns)
In [ ]:
shared_utils.ANY_IMP_FEATURES
In [ ]:
graph.boxen_plots_by_category(
    source_df=features_matrix_with_bins,
    group_col="pass__purpose",
    target_col="pass__loan_amnt",
    title="Loan Amount by Purpose",
    x_label="Loan Amount",
)
In [ ]:
importlib.reload(graph)
for target_y in ["AmtCredit"]:
    for c in ["CodeGender", "DaysEmployed_binned", "NameEducationType", "OccupationType", "OwnCarAge_binned",
              "DaysBirth_binned", "NameFamilyStatus"
        , "NameFamilyStatus"]:
        # for c in features_matrix_with_bins.columns:
        #     if pd.api.types.is_numeric_dtype(features_matrix_with_bins[c]):
        #         continue
        #     
        #     for 
        display(graph.boxen_plot_by_cat(c, features_matrix_with_bins, target_y, drop_small_cats=True))
In [ ]: